<?php
/**
* DOMDocumentWrapper class simplifies work with DOMDocument.
*
* Known bug:
* - in XHTML fragments, <br /> changes to <br clear="none" />
*
* @todo check XML catalogs compatibility
* @author Tobiasz Cudnik <tobiasz.cudnik/gmail.com>
*/
class DOMDocumentWrapper {
/**
* @var DOMDocument
*/
public $document;
/**
* @todo Rewrite as a method and guess the value when null.
* @var unknown_type
*/
public $contentType = '';
public $xpath;
public $events = array();
public $eventsGlobal = array();
/**
* TODO
* @var unknown_type
*/
public $frames = array();
/**
* Document root, by default equals to document itself.
* Used by documentFragments.
*
* @var DOMNode
*/
public $root;
public $isDocumentFragment = null;
public $isXML = false;
public $isXHTML = false;
public $isHTML = false;
public $charset;
/**
 * Creates the wrapper and, when markup is supplied, loads it right away.
 *
 * @param mixed $markup Markup to load; nothing is loaded when it is null.
 * @param string|null $contentType Forwarded unchanged to load().
 * @param string|null $newDocumentID Forwarded unchanged to load().
 */
public function __construct($markup = null, $contentType = null, $newDocumentID = null) {
	if ($markup === null) {
		return;
	}
	$this->load($markup, $contentType, $newDocumentID);
}
/**
 * Registers this wrapper in phpQuery::$documents and loads the markup into it.
 *
 * @param mixed $markup Markup handed to loadMarkup().
 * @param string|null $contentType Optional content type (may carry a charset);
 *        stored lower-cased in $this->contentType.
 * @param string|null $newDocumentID ID to register under; a md5(microtime())
 *        value is generated when it is empty.
 * @return string|null The document ID (also stored as phpQuery::$defaultDocumentID)
 *         when loading succeeded, otherwise null.
 */
public function load($markup, $contentType = null, $newDocumentID = null) {
	$id = $newDocumentID
		? $newDocumentID
		: md5(microtime());
	if ($markup instanceof DOMDocument) {
		// TODO: integrate passed DOMDocument object into wrapper
	}
	phpQuery::$documents[$id] = $this;
	// Explicit cast: passing null to strtolower() is deprecated since PHP 8.1;
	// the resulting value ('' for null) is the same as before.
	$this->contentType = strtolower((string) $contentType);
	if ($this->loadMarkup($markup)) {
		$this->xpath = new DOMXPath($this->document);
		// remember last loaded document
		return phpQuery::$defaultDocumentID = $id;
	}
}
/**
 * Chooses the HTML or XML loader for the markup.
 *
 * When $this->contentType is set it decides the loader; otherwise the markup
 * itself is inspected (an XML declaration selects the XML loader, with an
 * HTML retry when XML loading of XHTML fails).
 *
 * @param string $markup
 * @return mixed Result of the selected loader, or false when no loader was chosen.
 */
protected function loadMarkup($markup) {
	$loaded = false;
	if ($this->contentType) {
		// content determined by contentType
		list($contentType, $charset) = $this->contentTypeToArray($this->contentType);
		switch($contentType) {
			case 'text/html':
				$loaded = $this->loadMakrupHTML($markup, $charset);
				break;
			case 'text/xml':
			case 'application/xhtml+xml':
				$loaded = $this->loadMakrupXML($markup, $charset);
				break;
			default:
				// for feeds or anything that sometimes doesn't use text/xml
				// (the haystack/needle arguments used to be swapped here, so
				// types such as application/rss+xml were never recognised)
				if (strpos($contentType, 'xml') !== false)
					$loaded = $this->loadMakrupXML($markup, $charset);
				else
					phpQuery::debug("Could not determine document type from content type '{$this->contentType}'");
		}
	} else {
		// content type autodetection
		if ($this->isXML($markup)) {
			$loaded = $this->loadMakrupXML($markup);
			if (! $loaded && $this->isXHTML) {
				phpQuery::debug('Loading as XML failed, trying to load as HTML');
				$loaded = $this->loadMakrupHTML($markup);
			}
		} else {
			$loaded = $this->loadMakrupHTML($markup);
		}
	}
	return $loaded;
}
/**
 * Clears the three document-flavour flags before a loader sets its own.
 */
protected function loadMarkupReset() {
	$this->isXML = false;
	$this->isXHTML = false;
	$this->isHTML = false;
}
/**
 * Replaces $this->document with a fresh DOMDocument and records its encoding
 * in $this->charset.
 *
 * @param string $charset Encoding passed to the DOMDocument constructor.
 * @param string $version XML version; an empty value falls back to '1.0'.
 */
protected function documentCreate($charset, $version = '1.0') {
	$xmlVersion = $version ? $version : '1.0';
	// TODO unload a previously created document, if there is one
	$document = new DOMDocument($xmlVersion, $charset);
	$document->formatOutput = true;
	$document->preserveWhiteSpace = true;
	$this->document = $document;
	$this->charset = $document->encoding;
}
/**
 * Loads markup as HTML.
 *
 * Resets the flavour flags, marks the wrapper as HTML, decides (once) whether
 * the markup is a fragment, picks a charset and then either delegates to
 * documentFragmentLoadMarkup() or parses the markup with DOMDocument::loadHTML().
 *
 * Charset precedence: charset found in the markup's Content-Type meta tag,
 * then $requestedCharset, then phpQuery::$defaultCharset.
 *
 * @param string $markup
 * @param string|null $requestedCharset
 * @return mixed Result of documentFragmentLoadMarkup() or of loadHTML().
 */
protected function loadMakrupHTML($markup, $requestedCharset = null) {
if (phpQuery::$debug)
phpQuery::debug('Full markup load (HTML): '.substr($markup, 0, 250));
$this->loadMarkupReset();
$this->isHTML = true;
// fragment detection runs only while the flag is still unset (null)
if (!isset($this->isDocumentFragment))
$this->isDocumentFragment = self::isDocumentFragmentHTML($markup);
$charset = null;
// charset declared by the markup itself (Content-Type meta tag), if any
$documentCharset = $this->charsetFromHTML($markup);
if ($documentCharset) {
$charset = $documentCharset;
} else if ($requestedCharset) {
$charset = $requestedCharset;
}
if (! $charset)
$charset = phpQuery::$defaultCharset;
if ($requestedCharset && $documentCharset && $requestedCharset != $documentCharset) {
// TODO place for charset conversion
// $charset = $requestedCharset;
}
$return = false;
if ($this->isDocumentFragment) {
phpQuery::debug("Full markup load (HTML), DocumentFragment detected, using charset '$charset'");
$return = $this->documentFragmentLoadMarkup($this, $charset, $markup);
} else {
if (! $documentCharset) {
// the markup declares no charset: inject a Content-Type meta tag carrying the chosen one
phpQuery::debug("Full markup load (HTML), appending charset '$charset'");
$markup = $this->charsetAppendToHTML($markup, $charset);
} else {
$charset = $documentCharset;
phpQuery::debug("Full markup load (HTML), using document's charset '{$charset}'");
}
// TODO: check if mb_convert_encoding is really needed
// $html = mb_convert_encoding($html, 'HTML-ENTITIES', self::$defaultEncoding);
// $html = '<meta http-equiv="Content-Type" content="text/html;charset='.self::$defaultEncoding.'">'.$html;
$this->documentCreate($charset);
// parser warnings are suppressed with @ unless phpQuery::$debug is exactly 2
$return = phpQuery::$debug === 2
? $this->document->loadHTML($markup)
: @$this->document->loadHTML($markup);
if ($return)
$this->root = $this->document;
}
if ($return && ! $this->contentType)
// TODO guess content type
;
return $return;
}
/**
 * Loads markup as XML (or XHTML).
 *
 * Resets the flavour flags, marks the wrapper as XML, detects XHTML from the
 * content type or the markup, decides (once) whether the markup is a fragment,
 * picks a charset and then either delegates to documentFragmentLoadMarkup()
 * or parses the markup with DOMDocument::loadXML().
 *
 * Fixes compared to the previous revision:
 * - the XML declaration built from an XHTML meta charset is now really
 *   prepended (its return value used to be thrown away);
 * - a declaration is never prepended when the markup already carries one,
 *   because a second declaration makes the document unparsable;
 * - the PHP version check uses version_compare() instead of comparing the
 *   version string with a float.
 *
 * @param string $markup
 * @param string|null $requestedCharset
 * @return mixed Result of documentFragmentLoadMarkup() or of loadXML().
 */
protected function loadMakrupXML($markup, $requestedCharset = null) {
	if (phpQuery::$debug)
		phpQuery::debug('Full markup load (XML): '.substr($markup, 0, 250));
	$this->loadMarkupReset();
	$this->isXML = true;
	// check against XHTML in contentType or markup
	$isContentTypeXHTML = $this->isXHTML();
	$isMarkupXHTML = $this->isXHTML($markup);
	if ($isContentTypeXHTML || $isMarkupXHTML)
		$this->isXHTML = true;
	// determine document fragment
	if (!isset($this->isDocumentFragment))
		$this->isDocumentFragment = $this->isXHTML
			? self::isDocumentFragmentXHTML($markup)
			: self::isDocumentFragmentXML($markup);
	// true when the markup already contains an XML declaration
	$hasDeclaration = $this->isXML($markup);
	// this charset will be used
	$charset = null;
	// charset from XML declaration @var string
	$documentCharset = $this->charsetFromXML($markup);
	if (! $documentCharset) {
		if ($this->isXHTML) {
			// this is XHTML, try to get charset from content-type meta header
			$documentCharset = $this->charsetFromHTML($markup);
			if ($documentCharset) {
				phpQuery::debug("Full markup load (XML), appending XHTML charset '$documentCharset'");
				// Fragments get their own declaration from
				// documentFragmentLoadMarkup(), so only full documents
				// without a declaration are touched here.
				if (! $this->isDocumentFragment && ! $hasDeclaration) {
					$markup = $this->charsetAppendToXML($markup, $documentCharset);
					$hasDeclaration = true;
				}
				$charset = $documentCharset;
			}
		}
		if (! $documentCharset) {
			// if still no document charset...
			$charset = $requestedCharset;
		}
	} else if ($requestedCharset) {
		$charset = $requestedCharset;
	}
	if (! $charset) {
		$charset = phpQuery::$defaultCharset;
	}
	if ($requestedCharset && $documentCharset && $requestedCharset != $documentCharset) {
		// TODO place for charset conversion
		// $charset = $requestedCharset;
	}
	$return = false;
	if ($this->isDocumentFragment) {
		phpQuery::debug("Full markup load (XML), DocumentFragment detected, using charset '$charset'");
		$return = $this->documentFragmentLoadMarkup($this, $charset, $markup);
	} else {
		if ($isContentTypeXHTML && ! $isMarkupXHTML && ! $documentCharset && ! $hasDeclaration) {
			phpQuery::debug("Full markup load (XML), appending charset '$charset'");
			$markup = $this->charsetAppendToXML($markup, $charset);
		}
		// see http://pl2.php.net/manual/en/book.dom.php#78929
		// LIBXML_DTDLOAD (>= PHP 5.1)
		// do XML catalogues work with LIBXML_NONET?
		// TODO test LIBXML_COMPACT for performance improvement
		// create document
		$this->documentCreate($charset);
		if (version_compare(PHP_VERSION, '5.1.0', '<')) {
			$this->document->resolveExternals = true;
			// parser warnings are suppressed unless phpQuery::$debug is exactly 2
			$return = phpQuery::$debug === 2
				? $this->document->loadXML($markup)
				: @$this->document->loadXML($markup);
		} else {
			/** @link http://pl2.php.net/manual/en/libxml.constants.php */
			$libxmlStatic = phpQuery::$debug === 2
				? LIBXML_DTDLOAD|LIBXML_DTDATTR|LIBXML_NONET
				: LIBXML_DTDLOAD|LIBXML_DTDATTR|LIBXML_NONET|LIBXML_NOWARNING|LIBXML_NOERROR;
			$return = $this->document->loadXML($markup, $libxmlStatic);
		}
		if ($return)
			$this->root = $this->document;
	}
	if ($return && ! $this->contentType)
		// TODO guess content type
		;
	return $return;
}
/**
 * Tells whether the content type (no argument) or the given markup looks
 * like XHTML.
 *
 * Without markup the stored content type is searched for 'xhtml'; with
 * markup the text is searched for an HTML doctype.
 *
 * @param string|null $markup
 * @return bool
 */
protected function isXHTML($markup = null) {
	// XXX ok ?
	$checkMarkup = isset($markup);
	$haystack = $checkMarkup ? $markup : $this->contentType;
	$needle = $checkMarkup ? "<!DOCTYPE html" : 'xhtml';
	return strpos($haystack, $needle) !== false;
}
/**
 * Tells whether the markup contains the start of an XML declaration.
 *
 * @param string $markup
 * @return bool
 */
protected function isXML($markup) {
	$declarationAt = strpos($markup, '<'.'?xml');
	return $declarationAt !== false;
}
/**
 * Splits a Content-Type value into its media type and charset.
 *
 * The value is lower-cased. The charset is taken from the first parameter
 * after the media type: the part after '=' when present, otherwise the whole
 * parameter. Surrounding whitespace and quotes are removed so values such as
 * 'text/html ; charset="UTF-8"' are handled.
 *
 * @param string $contentType
 * @return array Always two elements: array(mediaType, charsetOrNull).
 */
protected function contentTypeToArray($contentType) {
	$segments = explode(';', strtolower(trim((string) $contentType)));
	$type = trim($segments[0]);
	$charset = null;
	if (isset($segments[1])) {
		$pair = explode('=', $segments[1]);
		// strip 'charset='; a bare value without '=' is used as-is
		$value = isset($pair[1]) && trim($pair[1]) !== ''
			? $pair[1]
			: $pair[0];
		$value = trim($value, " \t\n\r\0\x0B\"'");
		if ($value !== '')
			$charset = $value;
	}
	return array($type, $charset);
}
/**
 * Extracts content type and charset from the markup's
 * meta http-equiv="Content-Type" tag.
 *
 * Attribute names are matched case-insensitively and the http-equiv attribute
 * may be the last attribute of the tag.
 *
 * @param string $markup
 * @return array array(contentType, charset); array(null, null) when no usable
 *         meta tag is found.
 */
protected function contentTypeFromHTML($markup) {
	// find meta tag
	$metaTag = array();
	preg_match('@<meta[^>]+http-equiv\\s*=\\s*(["\'])Content-Type\\1([^>]*?)>@i',
		$markup, $metaTag
	);
	if (! isset($metaTag[0]))
		return array(null, null);
	// get attr 'content'
	$content = array();
	preg_match('@content\\s*=\\s*(["\'])(.+?)\\1@i', $metaTag[0], $content);
	if (! isset($content[2]))
		return array(null, null);
	return $this->contentTypeToArray($content[2]);
}
/**
 * Returns the charset announced by the markup's Content-Type meta tag.
 *
 * @param string $markup
 * @return string|null Second element of contentTypeFromHTML()'s result.
 */
protected function charsetFromHTML($markup) {
	list(, $charset) = $this->contentTypeFromHTML($markup);
	return $charset;
}
/**
 * Returns the lower-cased encoding named in the markup's XML declaration.
 *
 * @param string $markup
 * @return string|null Null when no encoding attribute is found.
 */
protected function charsetFromXML($markup) {
	// find declaration
	$pattern = '@<'.'?xml[^>]+encoding\\s*=\\s*(["|\'])(.*?)\\1@i';
	$found = array();
	preg_match($pattern, $markup, $found);
	if (! isset($found[2])) {
		return null;
	}
	return strtolower($found[2]);
}
/**
 * Injects a Content-Type meta tag carrying $charset into HTML markup.
 *
 * The tag is placed right after the first opening head tag; when there is no
 * head it is wrapped in a new head placed after the first opening html tag;
 * when there is neither it is prepended to the markup.
 *
 * Tag detection is case-insensitive and requires a word boundary after the
 * tag name, so elements such as 'header' are not mistaken for 'head'. Only
 * the first match is rewritten.
 *
 * @param string $html
 * @param string $charset
 * @param bool $xhtml When true the meta tag is written self-closing.
 * @return string|null Markup with the meta tag added (null only if the
 *         regular expression engine reports an error).
 */
protected function charsetAppendToHTML($html, $charset, $xhtml = false) {
	$meta = '<meta http-equiv="Content-Type" content="text/html;charset='
		.$charset.'" '
		.($xhtml ? '/' : '')
		.'>';
	// The trailing conditional group is kept from the previous revision: it
	// only requires '>' when the preceding character is not '?'.
	if (preg_match('@<head\b@i', $html)) {
		return preg_replace(
			'@(<head\b.*?)(?(?<!\?)>)@si',
			'\\1>'.$meta,
			$html,
			1
		);
	}
	if (preg_match('@<html\b@i', $html)) {
		return preg_replace(
			'@(<html\b.*?)(?(?<!\?)>)@si',
			"\\1><head>{$meta}</head>",
			$html,
			1
		);
	}
	return $meta.$html;
}
/**
 * Prepends an XML 1.0 declaration naming $charset to the markup.
 *
 * @param string $markup
 * @param string $charset
 * @return string
 */
protected function charsetAppendToXML($markup, $charset) {
	$pieces = array(
		'<'.'?xml version="1.0" encoding="',
		$charset,
		'"?'.'>',
		$markup,
	);
	return implode('', $pieces);
}
/**
 * HTML markup counts as a fragment when it contains no opening html tag
 * (case-insensitive search).
 *
 * @param string $markup
 * @return bool
 */
public static function isDocumentFragmentHTML($markup) {
	$htmlTagAt = stripos($markup, '<html');
	return $htmlTagAt === false;
}
/**
 * XML markup counts as a fragment when it contains no XML declaration
 * (case-insensitive search).
 *
 * @param string $markup
 * @return bool
 */
public static function isDocumentFragmentXML($markup) {
	$declarationAt = stripos($markup, '<'.'?xml');
	return $declarationAt === false;
}
/**
 * XHTML fragment detection; applies the same rule as the HTML variant.
 *
 * @param string $markup
 * @return bool
 */
public static function isDocumentFragmentXHTML($markup) {
	$isFragment = self::isDocumentFragmentHTML($markup);
	return $isFragment;
}
/**
 * Placeholder: not implemented yet. The body is empty, so the call has no
 * effect and returns null.
 *
 * @param mixed $value Currently unused.
 */
public function importAttr($value) {
// TODO
}
/**
 * Imports nodes or string markup into this wrapper's document.
 *
 * A single DOMNode, an array of nodes or a DOMNodeList is deep-copied with
 * DOMDocument::importNode(). Anything else is treated as string markup: it is
 * parsed into a temporary fragment whose root's child nodes are then imported.
 * The imported nodes are returned, not attached to the tree.
 *
 * @param DOMNode|DOMNodeList|array|string $source
 * @param string|null $sourceCharset Charset used when parsing string markup.
 * @return array Array of imported nodes.
 */
public function import($source, $sourceCharset = null) {
	// TODO charset conversions
	if ($source instanceof DOMNode) {
		$source = array($source);
	}
	$isNodeCollection = is_array($source) || $source instanceof DOMNodeList;
	if (! $isNodeCollection) {
		// string markup
		$fake = $this->documentFragmentCreate($source, $sourceCharset);
		return $this->import($fake->root->childNodes);
	}
	// dom nodes
	$imported = array();
	foreach ($source as $node) {
		$imported[] = $this->document->importNode($node, true);
	}
	return $imported;
}
/**
 * Creates new document fragment.
 *
 * The temporary wrapper inherits this wrapper's content type and its
 * XML/XHTML/HTML flags, so a fragment of an XML document is parsed and
 * serialised as XML rather than as HTML.
 *
 * @param DOMNode|DOMNodeList|array|string $source Nodes to copy into the
 *        fragment, or string markup to parse.
 * @param string|null $charset Defaults to this wrapper's charset.
 * @return DOMDocumentWrapper
 */
protected function documentFragmentCreate($source, $charset = null) {
	$fake = new DOMDocumentWrapper();
	$fake->contentType = $this->contentType;
	// documentFragmentLoadMarkup() and documentFragmentToMarkup() branch on
	// these flags; without copying them every fragment was treated as HTML.
	$fake->isXML = $this->isXML;
	$fake->isHTML = $this->isHTML;
	$fake->isXHTML = $this->isXHTML;
	$fake->root = $fake->document;
	if (! $charset)
		$charset = $this->charset;
	if ($source instanceof DOMNode)
		$source = array($source);
	if (is_array($source) || $source instanceof DOMNodeList) {
		// dom nodes: load an empty fake document, then move copies in
		$this->documentFragmentLoadMarkup($fake, $charset);
		$nodes = $fake->import($source);
		foreach($nodes as $node)
			$fake->root->appendChild($node);
	} else {
		// string markup
		$this->documentFragmentLoadMarkup($fake, $charset, $source);
	}

	return $fake;
}
/**
 * Loads fragment markup into $fragment by wrapping it in a synthetic document.
 *
 * XML fragments are wrapped in a 'fake' root element (with the XHTML
 * namespace and doctype for XHTML); HTML fragments are wrapped in
 * html/head/body. $fragment->root is pointed at the wrapper element: the
 * document element for XML, the body element for HTML. When nothing could be
 * parsed the root falls back to the document itself.
 *
 * @param DOMDocumentWrapper $fragment Wrapper to load into (may be $this).
 * @param string $charset
 * @param string|null $markup Fragment markup; null creates an empty fragment.
 * @return bool Whether the underlying loader reported success.
 */
private function documentFragmentLoadMarkup($fragment, $charset, $markup = null) {
	// TODO copy doctype
	// temporarily turned off so the loaders take the full-document path
	$fragment->isDocumentFragment = false;
	$root = null;
	if ($fragment->isXML) {
		$declaration = '<?xml version="1.0" encoding="'.$charset.'"?>';
		if ($fragment->isXHTML) {
			// add FAKE element to set default namespace
			$loaded = $fragment->loadMakrupXML($declaration
				.'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" '
				.'"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'
				.'<fake xmlns="http://www.w3.org/1999/xhtml">'.$markup.'</fake>');
		} else {
			$loaded = $fragment->loadMakrupXML($declaration.'<fake>'.$markup.'</fake>');
		}
		// documentElement instead of firstChild: for XHTML the first child
		// of the document is the DOCTYPE node, not the fake element
		if ($fragment->document)
			$root = $fragment->document->documentElement;
	} else {
		$markup2 = '<html><head><meta http-equiv="Content-Type" content="text/html;charset='
			.$charset.'"></head>';
		$noBody = stripos((string) $markup, '<body') === false;
		if ($noBody)
			$markup2 .= '<body>';
		$markup2 .= $markup;
		if ($noBody)
			$markup2 .= '</body>';
		$markup2 .= '</html>';
		$loaded = $fragment->loadMakrupHTML($markup2);
		// TODO resolve body tag merging issue
		if ($fragment->document) {
			$root = $fragment->document->getElementsByTagName('body')->item(0);
			if (! $root)
				$root = $fragment->document->documentElement;
		}
	}
	$fragment->root = $root ? $root : $fragment->document;
	$fragment->isDocumentFragment = true;
	return (bool) $loaded;
}
/**
 * Serialises a fragment wrapper and strips the synthetic wrapper markup
 * ('fake' element for XML, body element for HTML) around its content.
 *
 * @param DOMDocumentWrapper $fragment
 * @return string Inner markup of the fragment.
 */
protected function documentFragmentToMarkup($fragment) {
	phpQuery::debug('documentFragmentToMarkup');
	// serialise as a full document, then restore the flag
	$tmp = $fragment->isDocumentFragment;
	$fragment->isDocumentFragment = false;
	$markup = $fragment->markup();
	$fragment->isDocumentFragment = $tmp;
	$markup = $fragment->isXML
		? self::unwrapFragmentMarkup($markup, '<fake', '</fake>')
		: self::unwrapFragmentMarkup($markup, '<body', '</body>');
	if (phpQuery::$debug)
		phpQuery::debug('documentFragmentToMarkup: '.substr($markup, 0, 150));
	return $markup;
}
/**
 * Returns the text between the first opening tag that starts with $openTag
 * and the last occurrence of $closeTag.
 *
 * Locating the end of the opening tag by its '>' (instead of a fixed length)
 * handles wrappers with attributes and avoids off-by-one offsets.
 *
 * @param string $markup
 * @param string $openTag Start of the opening tag, e.g. '<fake'.
 * @param string $closeTag Complete closing tag, e.g. '</fake>'.
 * @return string Empty string when the wrapper is missing or self-closed.
 */
private static function unwrapFragmentMarkup($markup, $openTag, $closeTag) {
	$markup = (string) $markup;
	$openAt = strpos($markup, $openTag);
	$closeAt = strrpos($markup, $closeTag);
	if ($openAt === false || $closeAt === false)
		return '';
	$openEnd = strpos($markup, '>', $openAt);
	if ($openEnd === false || $openEnd >= $closeAt)
		return '';
	return (string) substr($markup, $openEnd + 1, $closeAt - $openEnd - 1);
}
/**
 * Return document markup, starting with optional $nodes as root.
 *
 * Without nodes the whole document (or, for fragments, the fragment content)
 * is serialised. With nodes, either their outer markup or - when
 * $innerMarkup is true - the markup of their children is returned. A
 * DOMDocument passed alone (directly, or as the only item of an array or
 * node list) is treated like "no nodes".
 *
 * @param DOMNode|DOMNodeList|array|null $nodes
 * @param bool $innerMarkup
 * @return string
 */
public function markup($nodes = null, $innerMarkup = false) {
	// Detect "the document itself" without calling count() or using array
	// access on a plain DOMNode, which throws a TypeError on PHP 8.
	if (isset($nodes)) {
		if ($nodes instanceof DOMDocument) {
			$nodes = null;
		} else if ($nodes instanceof DOMNodeList) {
			if ($nodes->length == 1 && $nodes->item(0) instanceof DOMDocument)
				$nodes = null;
		} else if (is_array($nodes) && count($nodes) == 1) {
			$only = reset($nodes);
			if ($only instanceof DOMDocument)
				$nodes = null;
		}
	}
	if (isset($nodes)) {
		$markup = '';
		if (!is_array($nodes) && !($nodes instanceof DOMNodeList) )
			$nodes = array($nodes);
		if ($this->isXML && ! $innerMarkup) {
			self::debug("Getting outerXML with charset '{$this->charset}'");
			// we need outerXML, so we can benefit from
			// $node param support in saveXML()
			foreach($nodes as $node)
				$markup .= $this->document->saveXML($node);
		} else {
			$loop = array();
			if ($innerMarkup)
				foreach($nodes as $node)
					foreach($node->childNodes as $child)
						$loop[] = $child;
			else
				$loop = $nodes;
			self::debug("Getting markup, moving selected nodes (".count($loop).") to new DocumentFragment");
			$fake = $this->documentFragmentCreate($loop);
			$markup = $this->documentFragmentToMarkup($fake);
		}
		if ($this->isXHTML) {
			self::debug("Fixing XHTML");
			$markup = self::markupFixXHTML($markup);
		}
		self::debug("Markup: ".substr($markup, 0, 250));
		return $markup;
	} else {
		if ($this->isDocumentFragment) {
			// documentFragment, html only...
			self::debug("Getting markup, DocumentFragment detected");
			$markup = $this->documentFragmentToMarkup($this);
			// no need for markupFixXHTML, as it's done through markup($nodes) method
			return $markup;
		} else {
			self::debug("Getting markup (".($this->isXML?'XML':'HTML')."), final with charset '{$this->charset}'");
			$markup = $this->isXML
				? $this->document->saveXML()
				: $this->document->saveHTML();
			if ($this->isXHTML) {
				self::debug("Fixing XHTML");
				$markup = self::markupFixXHTML($markup);
			}
			self::debug("Markup: ".substr($markup, 0, 250));
			return $markup;
		}
	}
}
/**
 * Expands self-closed script, select and textarea tags in XHTML markup into
 * explicit open/close pairs.
 *
 * @param string $markup
 * @return string
 */
protected static function markupFixXHTML($markup) {
	foreach (array('script', 'select', 'textarea') as $tag) {
		$markup = self::expandEmptyTag($tag, $markup);
	}
	return $markup;
}
/**
 * Forwards a debug message to phpQuery::debug().
 *
 * @param mixed $text Message passed through unchanged.
 */
public static function debug($text) {
phpQuery::debug($text);
}
/**
 * expandEmptyTag
 *
 * Rewrites every self-closed occurrence of $tag (with or without attributes)
 * into an explicit open/close pair, keeping the attribute text untouched.
 * Tags that are not self-closed, and unterminated tags, are left as they are.
 *
 * Compared to the previous hand-written scan this also handles a tag at
 * offset 0, a tag without attributes, and markup with a missing '>' (which
 * used to loop forever).
 *
 * @param string $tag Tag name, e.g. 'script'.
 * @param string $xml Markup to process.
 * @return string Processed markup (the input is returned unchanged if the
 *         regular expression engine reports an error).
 * @author mjaque at ilkebenson dot com
 * @link http://pl2.php.net/manual/en/domdocument.savehtml.php#81256
 */
public static function expandEmptyTag($tag, $xml){
	$pattern = '@<('.preg_quote($tag, '@').')((?:\s[^>]*)?)/>@';
	$expanded = preg_replace($pattern, '<$1$2></$1>', $xml);
	return $expanded === null ? $xml : $expanded;
}
}